# -*- coding: utf-8 -*-
"""
This script reads multiple *.embeddings.npz files, computes the row-wise cosine
similarity between the original and rewritten vectors stored in each file, and
prints summary statistics (count, mean, median, max, min) to the console.
It does not generate any output files.
"""

import numpy as np

# ======== Configuration ========
# Paths of the .npz archives to process; they are handled one at a time, in
# list order. The entries below are placeholders and must be replaced with
# real paths before running the script.
INPUT_FILES = [
    r"path/to/your/file1.npz",
    r"path/to/your/file2.npz",
    r"path/to/your/file3.npz",
    r"path/to/your/file4.npz",
    r"path/to/your/file5.npz",
    r"path/to/your/file6.npz",
    # Add more file paths as needed
]


# Number of sample similarities to print per file (taken from the start of
# each file's results); set to 0 to disable.
SHOW_SAMPLES = 0
# =========================

def process_one(npz_path: str, *, show_samples=None):
    """Print cosine-similarity statistics for one ``.npz`` embeddings file.

    The archive must contain two arrays, ``orig_vectors`` and
    ``rewrite_vectors``, with identical 2-D shapes (one vector per row).
    Row ``i`` of one array is compared with row ``i`` of the other via a dot
    product. That dot product equals the cosine similarity only if both sets
    of vectors are already L2-normalized; this function does not normalize
    or verify them.

    Args:
        npz_path: Path of the ``.npz`` archive to read.
        show_samples: How many leading similarity values to print. ``None``
            (the default) falls back to the module-level ``SHOW_SAMPLES``;
            ``0`` disables the sample listing.

    Raises:
        OSError: If the archive cannot be opened.
        KeyError: If either expected array is missing from the archive.
        ValueError: If the arrays are not 2-D or their shapes differ.
    """
    print(f"\n=== Processing {npz_path} ===")

    # NOTE(review): allow_pickle=True lets a crafted archive execute arbitrary
    # code while loading. It is kept for backward compatibility; switch to
    # allow_pickle=False if the vectors are plain numeric arrays and the
    # files may come from an untrusted source.
    # The context manager closes the underlying zip file handle; indexing
    # the archive returns fully loaded arrays, so they stay valid afterwards.
    with np.load(npz_path, allow_pickle=True) as data:
        orig = data["orig_vectors"]
        rewrite = data["rewrite_vectors"]

    # Reject shapes that NumPy would otherwise broadcast silently (for
    # example (N, D) against (1, D)), which would yield misleading statistics.
    if orig.ndim != 2 or orig.shape != rewrite.shape:
        raise ValueError(
            "orig_vectors and rewrite_vectors must be 2-D arrays of the same "
            f"shape, got {orig.shape} and {rewrite.shape} in {npz_path}"
        )

    # Row-wise dot product; equals cosine similarity for normalized vectors.
    cosines = np.sum(orig * rewrite, axis=1)

    print(f"Total entries: {len(cosines)}")
    if cosines.size == 0:
        # mean/median of an empty array are NaN and max/min raise, so stop
        # here instead of crashing halfway through the report.
        print("No entries to summarize.")
        return

    print(f"Average similarity: {cosines.mean():.4f}")
    print(f"Median: {np.median(cosines):.4f}")
    print(f"Max: {cosines.max():.4f}  Min: {cosines.min():.4f}")

    limit = SHOW_SAMPLES if show_samples is None else show_samples
    if limit > 0:
        print(f"\nFirst {limit} sample cosine values:")
        for c in cosines[:limit]:
            print(f"  {c:.4f}")

def _run_all() -> None:
    """Process every configured input file in order, then print a final notice."""
    for npz_file in INPUT_FILES:
        process_one(npz_file)
    print("\nAll calculations are complete.")


# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    _run_all()